##  sort.texts.pl
##
##  This takes the summary.* and texts.* output from merge.LN.text.pl, sorts the summary, and then 
##  writes the cases in dyad-date order with headers for each new dyad. Stories are located using  
##  the unique story identifier.
##
##  TO RUN PROGRAM:
##
##  perl sort.texts.pl suffix
##
##  where suffix is the suffix (at least 4 characters long) of the files "summary.$ARGV[0]" and "texts.$ARGV[0]"
##
##  INPUT FILE:
##  the aforementioned output files from merge.LN.text.pl.
##               
##  OUTPUT FILES:
## 
##  "summary.sorted.$ARGV[0]": sorted version of the summary file
##
##  "sorted.$ARGV[0].txt": Stories in dyad-date order with headers
##
##  PROGRAMMING NOTES:
##  None
##
##  SYSTEM REQUIREMENTS
##  This program has been successfully run under Mac OS 10.5; it is standard perl
##  so it should also run in Unix or Windows. 
##
##  PROVENANCE:
##  Programmer: Philip A. Schrodt
##              Dept of Political Science
##              Pennsylvania State University
##              227 Pond Laboratory
##	            University Park, PA, 16802 U.S.A.
##	            http://eventdata.psu.edu
##
##	Copyright (c) 2010  Philip A. Schrodt.  All rights reserved.
##
## 	Redistribution and use in source and binary forms, with or without modification,
## 	are permitted under the terms of the GNU General Public License:
## 	http://www.opensource.org/licenses/gpl-license.html
##
##  Programming supported by National Science Foundation Political Science Program Grant
##  SES-0719634 "Improving the Efficiency of Militarized Interstate Dispute Data Collection using 
##  Automated Textual Analysis" and SES-0924240, "MID4: Updating the Militarized Dispute Data Set 
##  2002-2010." 
##
##	Report bugs to: schrodt@psu.edu
##
##	For plausible indenting of this source code, set the tab size in your editor to "2"
##
##  REVISION HISTORY:
##  26-July-08:  Initial version
##  16-Dec-09 :  Command line, assorted new formatting
##  02-Mar-10 :  Adjustment for non-unique file IDs
##  31-Mar-10 :  Sorts the summary file
##
##  ----------------------------------------------------------------------------------

#!/usr/local/bin/perl

# Validate the command line before doing any work.
#   $ARGV[0] : file suffix shared by "summary.<suffix>" and "texts.<suffix>";
#              it must be present and at least 4 characters long.
# On any failure a message is printed and the script exits with a non-zero
# status so that calling shells / make rules can detect that nothing was done.
if (@ARGV < 1 or length($ARGV[0]) < 4) {
  print "File suffix is required to run the program\n";
  print "Usage: perl sort.texts.pl suffix   (suffix must be at least 4 characters long)\n";
  exit 1;
}

# The summary file is the input of the sort step, so it has to exist already.
if (!(-e "summary.$ARGV[0]"))  {
	print "Can\'t find the file \"summary.$ARGV[0]\"\nExiting...\n";
	exit 1;
}

# Sort "summary.<suffix>" into "summary.sorted.<suffix>" with the system's
# sort utility.  The shell redirection creates the output file even when sort
# itself fails, so the status returned by system() must be checked; otherwise
# the rest of the script would quietly run on an empty or truncated file.
# NOTE(review): the suffix is interpolated into a shell command line, so a
# suffix containing whitespace or shell metacharacters will not work as a
# plain file name here.
print "Sorting \"summary.$ARGV[0]\"\n";
my $sort_status = system("sort summary.$ARGV[0] > summary.sorted.$ARGV[0]");
if ($sort_status == -1) {
	# system() returns -1 when the command could not be started at all
	die "Can\'t run sort on \"summary.$ARGV[0]\"; error $!\n";
}
elsif ($sort_status != 0) {
	# otherwise the value is the wait status; the exit code is in the high byte
	die "Sorting \"summary.$ARGV[0]\" failed; sort exit status " . ($sort_status >> 8) . "\n";
}

# ------------------------------------------------------------------------------
# Main pass: walk the sorted summary file and copy the matching story for each
# summary line from "texts.<suffix>" to "sorted.<suffix>.txt".
#
# Summary lines are tab-delimited.  Fields used here:
#   0, 1 : joined as "<field0>-<field1>" to form the dyad label
#   3    : story identifier used to locate the story in the texts file
#   5, 6 : written out as "News source" and "SVM score"
# A banner is written every time the dyad label changes.
#
# The story identifier and dyad label are matched as literal substrings
# (index) rather than being interpolated into a regex, so characters such as
# "." or "(" in an identifier cannot over-match or abort the run.
#
# NOTE(review): the summary line is deliberately not chomped, so if a field
# used below is the last one on the line it still carries its newline —
# confirm against the merge.LN.text.pl output format.
# ------------------------------------------------------------------------------
use strict;
use warnings;

my $suffix     = $ARGV[0];
my $texts_name = "texts.$suffix";

open(my $summary_fh, '<', "summary.sorted.$suffix")
	or die "Can\'t open sorted summary file \"summary.sorted.$suffix\" for reading; error $!";
open(my $out_fh, '>', "sorted.$suffix.txt")
	or die "Can\'t open output file \"sorted.$suffix.txt\" for writing; error $!";

my $prev_dyad = "";

SUMMARY_LINE:
while (my $record = <$summary_fh>) {
	my @fields = split(/\t/, $record);

	# A line without a usable story identifier cannot be looked up; skip it
	# instead of matching on an empty pattern.  Blank lines are skipped quietly.
	if (@fields < 4 or $fields[3] =~ /^\s*$/) {
		warn "Skipping malformed summary line $.: $record" unless $record =~ /^\s*$/;
		next SUMMARY_LINE;
	}

	my $story_id = $fields[3];
	my $dyad     = "$fields[0]-$fields[1]";
	my $source   = defined $fields[5] ? $fields[5] : "";
	my $score    = defined $fields[6] ? $fields[6] : "";

	# New dyad: write the banner (dyad label repeated 8 times between rules)
	if ($dyad ne $prev_dyad) {
		print {$out_fh} "\n========================================================================================\n";
		print {$out_fh} "  $dyad  " x 8;
		print {$out_fh} "\n========================================================================================\n*****\n\n";
		$prev_dyad = $dyad;
	}

	# progress report on STDOUT
	print "$story_id:  $dyad\n";

	# The texts file is rescanned from the top for every summary line.
	open(my $texts_fh, '<', $texts_name)
		or die "Can\'t open texts input file \"$texts_name\"; error $!";

	my $found = 0;

	TEXT_LINE:
	while (my $id_line = <$texts_fh>) {
		next TEXT_LINE unless index($id_line, $story_id) >= 0;

		# The line after the identifier must name the same dyad; this gets
		# around the non-unique ID problem.  Running out of file here simply
		# lets the scan end on the next read.
		my $dyad_line = <$texts_fh>;
		next TEXT_LINE unless defined $dyad_line and index($dyad_line, $dyad) >= 0;

		# transfer the header: identifier line, dyad line and one more line,
		# followed by the two values taken from the summary record
		print {$out_fh} $id_line, $dyad_line;
		my $third_line = <$texts_fh>;
		print {$out_fh} $third_line if defined $third_line;
		print {$out_fh} "News source: $source\n";
		print {$out_fh} "SVM score: $score\n";

		# The next line is read but never written.
		# NOTE(review): presumably a header line superseded by the two lines
		# just written — confirm against the texts file layout.
		my $body_line = <$texts_fh>;

		# print the rest of the story, up to and including the dashed separator
		until (defined $body_line and $body_line =~ m/--------------------------------/) {
			$body_line = <$texts_fh>;
			last unless defined $body_line;   # ran off the end of the file
			print {$out_fh} $body_line;
			last if eof($texts_fh);
		}

		$found = 1;
		last TEXT_LINE;
	}

	close($texts_fh) or die "Can\'t close texts input file \"$texts_name\"; error $!";

	# Make a missing story visible instead of silently dropping it.
	warn "No story found in \"$texts_name\" for summary line $. (dyad $dyad): $story_id\n"
		unless $found;
}

close($out_fh) or die "Can\'t close output file \"sorted.$suffix.txt\"; error $!";
close($summary_fh) or die "Can\'t close sorted summary file \"summary.sorted.$suffix\"; error $!";
print "\nProgram has finished!\n";
